install.packages("corpus.JSS.papers", repos = "http://datacube.wu.ac.at/", type = "source")
data("JSS_papers", package = "corpus.JSS.papers")


Sys.setenv(NOAWT=TRUE)

#### libraries
library(tm)
library(slam)
library("topicmodels")
library("XML")

corpus <- Corpus(VectorSource(JSS_papers))

Sys.setlocale("LC_COLLATE", "C")



#### LOAD PRESS RELEASES - working directory should contain folder of press releases
txt <- "/-example-user-path/pressreleases/"
setwd("/-example-user-path/pressreleases/")

#### EXAMPLE:
#txt <- "/Users/hf/Dropbox/henryfolder/replicationfinal/framingpaper-09202017-pressreleases/"
#setwd("/Users/hf/Dropbox/henryfolder/replicationfinal/framingpaper-09202017-pressreleases/")
####

(JSS_papers <- Corpus(DirSource(txt),readerControl = list(language = "eng")))


#### removed corpus from below
JSS_dtm <- DocumentTermMatrix(JSS_papers,
 control = list(stemming = TRUE, stopwords = TRUE, minWordLength = 3,
 removeNumbers = TRUE, removePunctuation = TRUE))
 dim(JSS_dtm)

#### must run to here for comparisions with others
JSSbinary <- as.matrix(JSS_dtm)
v1 <- apply((JSSbinary > 0)*1,2,sum)
v2 <- v1/dim(JSS_dtm)[1]

#### remove words that appear in fewer than 1% of documents 
JSS_dtm2 <- JSS_dtm[,v2 > .01]

#### remove documents that have no mentioned words
v3 <- apply(JSS_dtm2,1,sum)
JSS_dtm3 <- JSS_dtm2[v3 > 0,]

#### reset working directory:
setwd("/-example-user-path/")

save(JSS_dtm3,file="pressrelease09272017.Rdata")

############## CAN START HERE ON UNIX

load("/-example-user-path/pressrelease09272017.Rdata")

#### press releases: get dates
####
#### extract dates

doc.list <- names(JSS_papers)
n.docs <- length(doc.list)

party <- name <- month <- year <- day2 <- day <- c()
for(i in 1:n.docs){
	month[i] <- strsplit(doc.list[i]," - ")[[1]][1]
	year[i] <- strsplit(doc.list[i]," - ")[[1]][3]
	day[i] <- strsplit(doc.list[i]," - ")[[1]][2]
	day2[i] <- strsplit(day[i],",")[[1]][1]
	name[i] <- strsplit(doc.list[i]," - ")[[1]][4]
	party[i] <- strsplit(doc.list[i]," - ")[[1]][6]
}
day3 <- as.numeric(day2)
monthsout <- rep(NA,n.docs)

#### missing alexander date
day3[153] <- day[153] <- 2

monthsout[year %in% c("09","2009") & month %in% c("Jan")] <- 1
monthsout[year %in% c("09","2009") & month %in% c("Feb")] <- 2
monthsout[year %in% c("09","2009") & month %in% c("Mar")] <- 3
monthsout[year %in% c("09","2009") & month %in% c("Apr")] <- 4
monthsout[year %in% c("09","2009") & month %in% c("May")] <- 5
monthsout[year %in% c("09","2009") & month %in% c("Jun")] <- 6
monthsout[year %in% c("09","2009") & month %in% c("Jul")] <- 7
monthsout[year %in% c("09","2009") & month %in% c("Aug")] <- 8
monthsout[year %in% c("09","2009") & month %in% c("Sep")] <- 9
monthsout[year %in% c("09","2009") & month %in% c("Oct")] <- 10
monthsout[year %in% c("09","2009") & month %in% c("Nov")] <- 11
monthsout[year %in% c("09","2009") & month %in% c("Dec")] <- 12

monthsout[year %in% c("010","10","2010") & month %in% c("Jan")] <- 13
monthsout[year %in% c("010","10","2010") & month %in% c("Feb")] <- 14
monthsout[year %in% c("010","10","2010") & month %in% c("Mar")] <- 15
monthsout[year %in% c("010","10","2010") & month %in% c("Apr")] <- 16
monthsout[year %in% c("010","10","2010") & month %in% c("May")] <- 17
monthsout[year %in% c("010","10","2010") & month %in% c("Jun")] <- 18
monthsout[year %in% c("010","10","2010") & month %in% c("Jul")] <- 19
monthsout[year %in% c("010","10","2010") & month %in% c("Aug")] <- 20
monthsout[year %in% c("010","10","2010") & month %in% c("Sep")] <- 21
monthsout[year %in% c("010","10","2010") & month %in% c("Oct")] <- 22
monthsout[year %in% c("010","10","2010") & month %in% c("Nov")] <- 23
monthsout[year %in% c("010","10","2010") & month %in% c("Dec")] <- 24

#### summarize distribution of words
summary(col_sums(JSS_dtm3))

#### summarize length of documents
summary(row_sums(JSS_dtm3))

#####################
#### RUN LDA_GIBBS TOPIC MODEL
#####################

library(topicmodels)
library(slam)
library(xtable)

k <- 12
SEED <- 60178

#### full model:
#### WARNING: takes significant time to complete
#jss_TM_gibbs2 <- list(LDA(JSS_dtm3, k = k, method = "Gibbs",
 #control = list(seed = SEED, burnin = 30000,
 #thin = 1000, iter = 200000)))
 
#### shorter model for testing:
 jss_TM_gibbs2 <- list(LDA(JSS_dtm3, k = k, method = "Gibbs",
  control = list(seed = SEED, burnin = 10,
  thin = 1, iter = 1000)))

save(jss_TM_gibbs2,file="gibbsout100117test_full.Rdata")

load("gibbsout100117test_full.Rdata")

#########################
############### PLOT RESULTS 
#########################

k <- 12

#### lists available output for CTM
slotNames(jss_TM_gibbs2[[1]])

#### lists all documents
doc.list <- sapply(jss_TM_gibbs2[1], slot, "documents")
n.docs <- length(doc.list)

(k-1)*n.docs

p.out1 <- posterior(jss_TM_gibbs2[[1]])
round(apply(p.out1$topics,2,mean),digits=4)

xtable(terms(jss_TM_gibbs2[[1]],20)[,1:6])
xtable(terms(jss_TM_gibbs2[[1]],20)[,7:12])



#### extract dates
party <- name <- month <- year <- day2 <- day <- c()
for(i in 1:n.docs){
	month[i] <- strsplit(doc.list[i]," - ")[[1]][1]
	year[i] <- strsplit(doc.list[i]," - ")[[1]][3]
	day[i] <- strsplit(doc.list[i]," - ")[[1]][2]
	day2[i] <- strsplit(day[i],",")[[1]][1]
	name[i] <- strsplit(doc.list[i]," - ")[[1]][4]
	party[i] <- strsplit(doc.list[i]," - ")[[1]][6]
}
day3 <- as.numeric(day2)
monthsout <- rep(NA,n.docs)

#### missing alexander date
day3[153] <- day[153] <- 2

monthsout[year %in% c("09","2009") & month %in% c("Jan")] <- 1
monthsout[year %in% c("09","2009") & month %in% c("Feb")] <- 2
monthsout[year %in% c("09","2009") & month %in% c("Mar")] <- 3
monthsout[year %in% c("09","2009") & month %in% c("Apr")] <- 4
monthsout[year %in% c("09","2009") & month %in% c("May")] <- 5
monthsout[year %in% c("09","2009") & month %in% c("Jun")] <- 6
monthsout[year %in% c("09","2009") & month %in% c("Jul")] <- 7
monthsout[year %in% c("09","2009") & month %in% c("Aug")] <- 8
monthsout[year %in% c("09","2009") & month %in% c("Sep")] <- 9
monthsout[year %in% c("09","2009") & month %in% c("Oct")] <- 10
monthsout[year %in% c("09","2009") & month %in% c("Nov")] <- 11
monthsout[year %in% c("09","2009") & month %in% c("Dec")] <- 12

monthsout[year %in% c("010","10","2010") & month %in% c("Jan")] <- 13
monthsout[year %in% c("010","10","2010") & month %in% c("Feb")] <- 14
monthsout[year %in% c("010","10","2010") & month %in% c("Mar")] <- 15
monthsout[year %in% c("010","10","2010") & month %in% c("Apr")] <- 16
monthsout[year %in% c("010","10","2010") & month %in% c("May")] <- 17
monthsout[year %in% c("010","10","2010") & month %in% c("Jun")] <- 18
monthsout[year %in% c("010","10","2010") & month %in% c("Jul")] <- 19
monthsout[year %in% c("010","10","2010") & month %in% c("Aug")] <- 20
monthsout[year %in% c("010","10","2010") & month %in% c("Sep")] <- 21
monthsout[year %in% c("010","10","2010") & month %in% c("Oct")] <- 22
monthsout[year %in% c("010","10","2010") & month %in% c("Nov")] <- 23
monthsout[year %in% c("010","10","2010") & month %in% c("Dec")] <- 24



mean2 <- function(x){
	mean(x,na.rm=T)
}


sd2 <- function(x){
	sd(x,na.rm=T)
}

mean(round(apply(p.out1$topics[party=="D.txt",],2,sd2),digits=4))

dems.lda <- round(apply(p.out1$topics[party=="D.txt",],2,mean2),digits=4)
gop.lda <- round(apply(p.out1$topics[party=="R.txt",],2,mean2),digits=4)

mean(abs(dems.lda-gop.lda))


k <- 12
n.months <- max(monthsout,na.rm=TRUE)
n.days <- n.months*28
dmatd <- rmatd <- matrix(NA,n.days,k)

for(i in 1:n.days){

	month.idx <- round(i/28+.4999,digits=0)
	day.within.month.idx <- i-(month.idx-1)*28

	idx1.d <- which((monthsout==month.idx) & (day3 <= day.within.month.idx) & (party=="D.txt"))
	cat("Dem this month",length(idx1.d),"\n")
	idx1.r <- which((monthsout==month.idx) & (day3 <= day.within.month.idx) & (party=="R.txt"))
	cat("GOP this month",length(idx1.r),"\n")

	idx2.d <- which(monthsout==(month.idx-1) & day3 > day.within.month.idx & party=="D.txt")
	cat("Dem last month",length(idx2.d),"\n")
	idx2.r <- which(monthsout==(month.idx-1) & day3 > day.within.month.idx & party=="R.txt")
	cat("GOP last month",length(idx2.r),"\n")

	idx.d <- c(idx1.d,idx2.d)
	idx.r <- c(idx1.r,idx2.r)

	for(j in 1:k){
		rmatd[i,j] <- mean(p.out1$topics[idx.r,j],na.rm=T)
		dmatd[i,j] <- mean(p.out1$topics[idx.d,j],na.rm=T)
	}
}

mn <- 0
mx <- .35

#### reweighting terms by distinctiveness
term.mat <- p.out1$terms
term.freq <- apply(p.out1$terms,2,mean)
names(term.freq) <- colnames(JSS_dtm3)
term.freq.mat <- matrix(NA,k,length(colnames(JSS_dtm3)))
colnames(term.freq.mat) <- colnames(JSS_dtm3)
for(i in 1:k){
	term.freq.mat[i,] <- p.out1$terms[i,]-term.freq
}
sort(term.freq.mat[1,])

n.terms <- 15

mntxt3 <- mntxt2 <- mntxt1 <- c()
for(j in 1:k){
	Terms.gibbs <- names(sort((term.freq.mat[j,]),decreasing=T))
	mntxt1[j] <- paste(Terms.gibbs[1:(n.terms/3)],collapse=",")
	mntxt2[j] <- paste(Terms.gibbs[(n.terms/3+1):(2*n.terms/3)],collapse=",")
	mntxt3[j] <- paste(Terms.gibbs[(2*n.terms/3+1):n.terms],collapse=",")
}


ss <- seq(from=28,to=n.days,by=28*3)
labels <- c("Jan9","Apr9","Jul9","Oct9","Jan10","Apr10","Jul10","Oct10")

#pdf("gibbs12daily063011PR.pdf",width=10)
#postscript("gibbs12dailysm071411.eps",width=10)

par(mfcol=c(3,4))

for(j in 1:k){
plot(dmatd[,j],type="l",ylim=c(mn,mx),col="blue",main=paste(mntxt1[j],"\n",mntxt2[j],"\n",mntxt3[j],sep=""),xlab="Month",ylab="Share",cex.main=1,cex.lab=1.4,xaxt="n")
  par(new=T)
  plot(rmatd[,j],type="l",ylim=c(mn,mx),col="black",main="",xlab="",ylab="",lty=4,xaxt="n",lwd=1.5)
  axis(side=1,at=ss,labels=labels)
  for(l in ss){
    abline(v=l,lty=3,col="gray")
  }	

}

#dev.off()


sd2 <- function(x){
  sd(x,na.rm=T)  
}
mean(apply(dmatd,2,sd2))
mean(apply(rmatd,2,sd2))

mean2 <- function(x){
  mean(x,na.rm=T)
}

p.dist.d <- apply(p.out1$topics[party=="D.txt",],2,mean2)
p.dist.r <- apply(p.out1$topics[party=="R.txt",],2,mean2)

mean(abs(p.dist.d-p.dist.r))

colnames(dmatd) <- colnames(rmatd) <- paste(terms(jss_TM_gibbs2[[1]],1))

#save(dmatd,rmatd,monthsout,party,day3,file="overtimematriciesgibbesPR091812.Rdata")
